####CLEAN INSPECTION DATA####

#Get inspection-defect file names and the date encoded in each file name
#Only names ending in ".RData" are kept: the dot is escaped and the pattern is
#anchored at the end, so e.g. "..._xRData" or "....RData.bak" is not picked up
filenames_reg <- list.files(pattern = "Geconstateerde_Gebreken.*?\\.RData$")
#Text between "Geconstateerde_Gebreken_" and ".RData" is the date part
filedate_reg <- str_match(filenames_reg,
                          "Geconstateerde_Gebreken_(.*?)\\.RData$")[, 2]

#Dutch month names that differ from the English spelling, mapped to English so
#that dmy() can parse them (april, september, november and december are
#spelled identically in both languages and need no translation)
dutch_to_english <- c(
  "[Jj]anuari"  = "january",
  "[Ff]ebruari" = "february",
  "[Mm]aart"    = "march",
  "[Mm]ei"      = "may",
  "[Jj]uni"     = "june",
  "[Jj]uli"     = "july",
  "[Aa]ugustus" = "august",
  "[Oo]ktober"  = "october"
)
for (month_pattern in names(dutch_to_english)) {
  filedate_reg <- gsub(month_pattern, dutch_to_english[[month_pattern]],
                       filedate_reg)
}

#One row per file: path and parsed date (NA when the name cannot be parsed)
Files_reg <- data.frame(path = filenames_reg,
                        date = as.Date(dmy(filedate_reg)),
                        stringsAsFactors = FALSE)
#Sort chronologically; `order` is then simply the chronological position
Files_reg <- Files_reg[order(Files_reg$date), ]
Files_reg$order <- seq_len(nrow(Files_reg))
rm(filedate_reg, filenames_reg, dutch_to_english, month_pattern)

#Keep only the earliest file of every calendar month
#Split the file date into its components (kept as columns of Files_reg)
Files_reg[c("year", "month", "day")] <- list(year(Files_reg$date),
                                             month(Files_reg$date),
                                             day(Files_reg$date))
#Smallest day-of-month observed within each (year, month) combination
first_days <- aggregate(day ~ year + month, data = Files_reg, FUN = min)
#Rebuild the full date of that first day and keep only files carrying it
first_dates <- as.Date(with(first_days, paste(day, month, year, sep = "/")),
                       format = "%d/%m/%Y")
Files_reg <- Files_reg[Files_reg$date %in% first_dates, ]
rm(first_days, first_dates)

#Save raw file (this passage is for technical reasons only)
#Each selected monthly file is re-saved as APK_<d>.RData with cleaned column
#names and character (instead of factor) columns.
#NOTE(review): the number of files is fixed at 20, and the merge step later in
#this script reads APK_1 ... APK_20; change both together.
n_files <- 20

#Fail early with a clear message instead of crashing on load(NA) half-way
#through, after some APK_<d>.RData files have already been written
if (nrow(Files_reg) < n_files) {
  stop("Expected at least ", n_files, " monthly inspection files, found ",
       nrow(Files_reg), call. = FALSE)
}

for (d in seq_len(n_files)) {

  #Assumes every source file stores its data frame under the name `x`
  load(Files_reg$path[d])

  #Drop dots from variable names, then make them all lower case
  names(x) <- tolower(gsub("\\.", "", names(x)))

  #Transform factor columns into character (vapply always returns a logical
  #vector, even for a data frame without columns)
  is_fac <- vapply(x, is.factor, logical(1))
  x[is_fac] <- lapply(x[is_fac], as.character)

  save(x, file = paste0("APK_", d, ".RData"))
}

#Empty the workspace before merging (presumably to free the memory held by
#the raw files) and list what is left
rm(list=ls()); gc()
objects()

#Number of APK_<d>.RData files written by the previous step
n_files <- 20

#Stack the files one at a time and drop duplicated records after every step,
#so that the accumulated table never holds more than one extra file's worth of
#duplicates. A record is a duplicate when its combination of kenteken,
#melddatumdoorkeuringsinstantie and gebrekidentificatie has already been seen;
#the first occurrence is kept.
load("APK_1.RData")

for (d in 2:n_files) {
  print(paste("Iteration n.", d))

  #load() overwrites `x`, so park the rows accumulated so far first
  merged_so_far <- x
  load(paste0("APK_", d, ".RData"))
  x <- rbind(merged_so_far, x)
  rm(merged_so_far)

  is_dup <- duplicated(cbind(x$kenteken,
                             x$melddatumdoorkeuringsinstantie,
                             x$gebrekidentificatie))
  #Explicit print(): a bare table() call is not auto-printed inside a loop
  print(table(is_dup))
  x <- x[!is_dup, , drop = FALSE]
}
rm(is_dup)

#Persist the merged defect records, then read them back together with the panel
save(x, file = "Merged_APK.RData")

load("Merged_APK.RData")

load("Panel_Full.RData")

#Count the rows per kenteken and report date: every row contributes 1
x$num <- 1
APKtot <- aggregate(num ~ kenteken + melddatumdoorkeuringsinstantie,
                    data = x, FUN = sum)

#The report date is stored as yyyymmdd; turn it into a Date
APKtot$melddatumdoorkeuringsinstantie <- as.Date(
  as.character(APKtot$melddatumdoorkeuringsinstantie),
  format = "%Y%m%d"
)

#The count column is called APKtot from here on
names(APKtot)[names(APKtot) == "num"] <- "APKtot"

#Distinct panel dates in increasing order (unparseable dates, if any, go last)
dates <- sort(as.Date(unique(as.character(Panel$date)), format = "%Y-%m-%d"),
              na.last = TRUE)

#For every panel date, keep for each kenteken the inspection record that lies
#closest before that date (strictly earlier), together with its APKtot count.
#Result: one row per (kenteken, panel date) with columns kenteken, APKtot, date.

#NOTE(review): only the first 18 panel dates are processed, as before;
#raise this if Panel_Full contains more dates that should be covered.
n_panel_dates <- 18

#Most recent record per kenteken strictly before `ref_date`.
#  apk      : data frame with kenteken, melddatumdoorkeuringsinstantie (Date)
#             and APKtot
#  ref_date : a single Date
#Returns a data frame with columns kenteken, APKtot, date; it has zero rows
#when no record precedes `ref_date` (or when `ref_date` is NA).
latest_apk_before <- function(apk, ref_date) {
  #Days between the reference date and each inspection report
  apk$datediff <- as.numeric(ref_date - apk$melddatumdoorkeuringsinstantie)
  #which() drops rows whose difference is NA
  prior <- apk[which(apk$datediff > 0), , drop = FALSE]

  #aggregate() fails on empty input, so only search when something is there
  if (nrow(prior) > 0) {
    nearest <- aggregate(datediff ~ kenteken, data = prior, FUN = min)
    prior <- merge(prior, nearest, by = "kenteken", all.x = TRUE, all.y = FALSE)
    #Keep, per kenteken, the row whose distance equals the minimum distance
    prior <- prior[which(prior$datediff.x == prior$datediff.y), , drop = FALSE]
  }

  out <- prior[, c("kenteken", "APKtot"), drop = FALSE]
  out$date <- rep(ref_date, nrow(out))
  out
}

if (length(dates) == 0) {
  stop("No panel dates available to match inspections against", call. = FALSE)
}

#Build all per-date tables first and bind them once, instead of growing the
#result with rbind() inside a loop
recent_by_date <- lapply(
  seq_len(min(n_panel_dates, length(dates))),
  function(d) latest_apk_before(APKtot, dates[d])
)
APKtot_recent <- do.call(rbind, recent_by_date)
rm(recent_by_date)

save(APKtot_recent, file="APKtot_recent.RData")


